# Replication materials for Clause Analysis (Van Atteveldt et al., Political Analysis)
#
# Code to create the figures and tables from section 5. SUBSTANTIVE USE CASE
# This code requires the clauses.rds, quotes.rds and entities.rds files created in get_tokens.r

# Load the project's helper file and the clause-level data (clauses.rds is
# produced by get_tokens.r, see the header above).
source("functions.r")
clauses <- readRDS("clauses.rds")

# Distinct ids of the clauses selected by the `attack` column
# (used as a row selector, so presumably logical -- TODO confirm).
agcl <- unique(clauses$clause_id[clauses$attack])

# Table 3: Who is quoted?
#
# Rows are reduced to one per distinct (clause_id, country, source) combination
# before tabulating, so repeated token rows of the same clause are not counted
# more than once. Rows without a source value are dropped.

# All clauses that have a source value: country x source counts
x <- unique(clauses[!is.na(clauses$source), c("clause_id", "country", "source")])
nn <- table(x$country, x$source)
nn
# Per country: count in the second source column relative to the first
nn[, 2] / nn[, 1]
chisq.test(nn)

# Same table, restricted to the attack clauses (ids in agcl)
x <- unique(clauses[!is.na(clauses$source) & (clauses$clause_id %in% agcl), c("clause_id", "country", "source")])
nn <- table(x$country, x$source)
nn
nn[, 2] / nn[, 1]
chisq.test(nn)



# Table 4: who is aggressor / victim?

# One row per distinct (clause_id, clause_role, country, actor) combination,
# limited to attack clauses (ids in agcl) whose actor is not the empty string.
acl <- unique(clauses[clauses$actor != "" & clauses$clause_id %in% agcl, c("clause_id", "clause_role", "country", "actor")])

# US: actor x clause_role counts, followed by the row proportions
nn.us <- with(subset(acl, country == "us"), table(actor, clause_role))
nn.us
nn.us / rowSums(nn.us)

# China: same layout (counts are printed as well, to match the US output)
nn.cn <- with(subset(acl, country == "cn"), table(actor, clause_role))
nn.cn
nn.cn / rowSums(nn.cn)

# Per actor: chi-squared test of country against clause role
with(subset(acl, actor == "Israel"), chisq.test(country, clause_role))
with(subset(acl, actor == "Hamas"), chisq.test(country, clause_role))


# Replication script for Figures 3 and 4
# Note that these are not deterministic, so figure might not be identical to printed version

# Install with devtools::install_github("kasperwelbers/corpus-tools")
library(corpustools)
# Install with devtools::install_github("kasperwelbers/semnet")
library(semnet)


# Figure 4: what do Hamas / Israel do according to China / US

# Document-term matrix of predicate lemmas per clause: only predicate tokens
# whose pos1 is one of V, N, M, A and whose lemma is not in the stop word list.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
stopwords <- read.csv("stop-word-list.csv", header = FALSE, stringsAsFactors = FALSE)[[1]]
p <- subset(clauses, clause_role == "predicate" & pos1 %in% c("V", "N", "M", "A") & !(lemma %in% stopwords))
predicates <- dtm.create(p$clause_id, p$lemma, filter.chars = FALSE, minlength = 2)

# Clause ids per country, and clause ids in which Hamas / Israel is the subject
cusa <- unique(clauses$clause_id[clauses$country == "us"])
cchina <- unique(clauses$clause_id[clauses$country == "cn"])
sh <- unique(clauses$clause_id[clauses$clause_role == "subject" & clauses$actor == "Hamas"])
si <- unique(clauses$clause_id[clauses$clause_role == "subject" & clauses$actor == "Israel"])

# The four subject x country clause sets, computed once and reused below
si.us <- intersect(cusa, si)
si.cn <- intersect(cchina, si)
sh.us <- intersect(cusa, sh)
sh.cn <- intersect(cchina, sh)

# N
c(length(si.us), length(si.cn), length(sh.us), length(sh.cn))

# Each subject is plotted twice, once per ordering of the two country sets
# (see plot.cooc.from.dtm for the meaning of the 2nd and 3rd argument).
plot.cooc.from.dtm(predicates, si.us, si.cn)
plot.cooc.from.dtm(predicates, si.cn, si.us)
plot.cooc.from.dtm(predicates, sh.us, sh.cn)
plot.cooc.from.dtm(predicates, sh.cn, sh.us)


# Figure 3: Who talks about whom?

entities <- readRDS("entities.rds")
quotes <- readRDS("quotes.rds")

# N: number of distinct quote ids per country
tapply(quotes$quote_id, quotes$country, function(ids) length(unique(ids)))

# Join the quotes to the entities of interest (the name "Israel", or any entity
# typed ORGANIZATION / PERSON) on their shared columns, keep four columns and
# drop every row that has a missing value in one of them.
qnames <- na.omit(
  merge(
    quotes,
    subset(entities, name == "Israel" | type %in% c("ORGANIZATION", "PERSON"))
  )[c("country", "quote_id", "quote_role", "name")]
)

# Split by role within the quote: names in the source part vs. in the quoted part
srcs <- qnames[qnames$quote_role == "source", c("country", "quote_id", "name")]
qts <- qnames[qnames$quote_role == "quote", c("country", "quote_id", "name")]

# Per source name: row counts per country, relative frequencies, the smoothed
# us/cn ratio of those frequencies (smoothing avoids division by zero), and a
# chi2 score from the corpustools internal helper.
cmp <- dcast(srcs, name ~ country, fun.aggregate = length, value.var = "quote_id")
smooth <- 0.000001
cmp$relcn <- with(cmp, cn / sum(cn))
cmp$relus <- with(cmp, us / sum(us))
cmp$overus <- with(cmp, (relus + smooth) / (relcn + smooth))
cmp$chi <- with(cmp, corpustools:::chi2(us, cn, sum(us) - us, sum(cn) - cn))

# Edge list: pair each source name (name.x) with each quoted name (name.y) of
# the same quote and country, then count the distinct quotes per pair.
x <- merge(srcs, qts, by = c("quote_id", "country"))
x <- aggregate(
  list(n = x$quote_id),
  by = x[c("name.x", "name.y", "country")],
  FUN = function(ids) length(unique(ids))
)

# Quote network China: edges from Chinese coverage in which the source or the
# quoted entity is a name with overus < 1 (relatively more frequent as a source
# in cn than in us). n counts distinct quotes per edge and is therefore always
# >= 1; the condition is a no-op at this value and acts as the knob for a
# minimum edge count.
g.cn <- graph.data.frame(subset(x, n >= 1 & country == "cn" & (name.x %in% cmp$name[cmp$overus < 1] | name.y %in% cmp$name[cmp$overus < 1])), directed = TRUE)
# Vertex weight = chi2 score of the name in cmp
V(g.cn)$weight <- cmp$chi[match(get.vertex.attribute(g.cn, "name"), cmp$name)]
# cmp only contains names that occur as a source, so a vertex that occurs only
# as a quoted entity has no match; give it the same small fallback weight as in
# the US graph instead of leaving NA.
V(g.cn)$weight[is.na(V(g.cn)$weight)] <- 0.01
g.cn <- graph.settings(g.cn, backbone = TRUE)
V(g.cn)$shape <- "none"
plot(g.cn, main = "Quote network China")

# Quote network US: same construction for US coverage, using the names with
# overus > 1 (relatively more frequent as a source in us than in cn).
g.us <- graph.data.frame(subset(x, country == "us" & (name.x %in% cmp$name[cmp$overus > 1] | name.y %in% cmp$name[cmp$overus > 1])), directed = TRUE)
V(g.us)$weight <- cmp$chi[match(get.vertex.attribute(g.us, "name"), cmp$name)]
# Fallback weight for vertices without a row in cmp (see above)
V(g.us)$weight[is.na(V(g.us)$weight)] <- 0.01
g.us <- graph.settings(g.us, backbone = TRUE)
V(g.us)$shape <- "none"
plot(g.us, main = "Quote network US")

